* Make dataset with number of patents by applicants, patent office and year
* Only for applicants with weights


cd /Users/fcoell/Dropbox/PATSTAT/DATA
global tmp "/tmp"
global dta "/Users/fcoell/Dropbox/PATSTAT_spring2015/"


* ----------------------------------------------
* Preliminaries : Find relevant patents in post-period
* ----------------------------------------------
use "$dta/tls201/tls201", clear
gen y = substr(appln_filing_date,1,4)
destring y, replace
drop if y<1965 | y==9999 //(9999=unknown year)
keep appln_id appln_auth y
save $tmp/patents, replace

* ----------------------------------------------
* Preliminaries: Save smaller version of 906 database
* ----------------------------------------------
use $dta/tls906/tls906_all, clear
keep person_id person_ctry_code hrm_l2_id han_id
* Identify headquarter as the most frequent country of residence
replace person_ctry_code = trim(person_ctry_code)
egen headq=mode( person_ctry_code), by( hrm_l2_id ) minmode
save $tmp/906, replace

* ----------------------------------------------
* Preliminaries: Granted patents
* ----------------------------------------------
use "$dta/tls211/tls211", clear
keep appln_id publn_first_grant
duplicates examples appln
collapse (max) publn, by(appln)
save $tmp/granted, replace

* ----------------------------------------------
* Preliminaries: NACE code
* ----------------------------------------------

use pat_hldr_allperiod_2, clear
contract hrm nace
drop _freq
save $tmp/hrm_nace, replace

* ----------------------------------------------
* Find hrm id's of firms we have pre-period weights for
* ----------------------------------------------
use weightswic3, clear 
contract hrm headq
keep hrm headq
save $tmp/applicants, replace

* ----------------------------------------------
* Find all applicants of relevant patents
* (note: potentially many applicants per patent)
* ----------------------------------------------

use "$dta/tls207/tls207", clear
keep if applt_seq_nr>0
merge m:1 appln_id using $tmp/patents, keep(match) 
drop _merge

* Find harmonized name
merge m:1 person_id using $tmp/906, keep(match) 
drop _merge

save pat_hldr3, replace

* ----------------------------------------------
* Count the number of inventors for relevant patents
* ----------------------------------------------
use "$dta/tls207/tls207", clear
keep if invt_seq_nr>0
merge m:1 appln_id using $tmp/patents, keep(match) 
drop _merge
collapse (count) num_inventors=invt, by(appln_id)
save inventors, replace

* ----------------------------------------------
* Count number of patents each firm and year
* ----------------------------------------------
use pat_hldr3, clear
drop if headq==""

* If the same firm is associated with the same application id twice or more, we drop those obs
by hrm headq appln_auth appln_id, sort: g unq=_n==1
drop if unq==0

* Get citations per application
ren appln_id cited_appln_id
merge m:1 cited_appln_id using forward_citations, keep(match master)
replace citations=0 if _merge==1
replace citations3year=0 if _merge==1
ren cited_appln_id appln_id 
drop _merge

* Get number of IPC codes per application
merge m:1 appln_id using ipc_codes, keep(match master)
drop _merge
// Note: number of IPC codes missing for 2% of applications 

* Get number of CPC codes per application
merge m:1 appln_id using cpc_codes, keep(match master)
drop _merge
// Note: number of CPC codes missing for 35% of applications 

* Get number of inventors
merge m:1 appln_id using inventors, keep(match master)
drop _merge
* Note: Missing # inventors for 10% of applications
 
* Get patent families
merge m:1 appln_id using "$dta/tls218/tls218", keep(match master)
drop _merge

* Identify granted patents
merge m:1 appln_id using "$tmp/granted", keep(match master)
drop _merge
* Note: publn_first_grant=1 if patent is granted

* Identify triadic patents
gen inEP = appln_auth=="EP" | appln_auth=="AT" | appln_auth=="BE" | appln_auth=="CH"| appln_auth=="DK" ///
| appln_auth=="ES"| appln_auth=="FI"| appln_auth=="IT"| appln_auth=="DE" | appln_auth=="FR"| appln_auth=="GB"

gen inUS = appln_auth=="US"
gen inJP = appln_auth=="JP"

* Identify unique countries patents are filed to
sort hrm headq docdb appln_auth y
by hrm headq docdb appln_auth, sort: gen unqmkt = _n == 1

collapse (sum) mkts=unqmkt citations* (mean) num_inventors num_ipc num_cpc /// 
(min) y (max) granted=publn_first_grant (max) inEP inUS inJP, by(hrm headq docdb)
label var mkts "Number of countries the patent is filed"

save patents_docdb, replace

gen triadic = inEP & inUS & inJP
collapse (count) p=docdb (sum) granted (sum) triadic (mean) mkts citations* num_inventors num_ipc num_cpc, by(hrm headq y)

label var p "Number of unique patents"
label var granted "Number of unique granted patents"
label var triadic "Number of unique triadic patents"
label var mkts "Average number of countries the patents are filed"
label var citations "Average number of citations for the patents of the firm"
label var citations3year "Average number of citations after 3 years for the patents of the firm"
label var num_inventors "Average number of inventors for the patents of the firm"
label var num_ipc "Average number of IPC codes for patents of the firm"
label var num_cpc "Average number of CPC codes for patents of the firm"

merge m:1 hrm using $tmp/hrm_nace, keep(match master)
drop _merge
save patentsfam4, replace
